By Group 1: Ya Ting Hu & Zhen Tian
import warnings
# Silence all warnings so notebook output stays clean.
# NOTE(review): this blanket filter also hides deprecation warnings from
# pandas / sklearn / statsmodels -- consider filtering specific categories.
warnings.filterwarnings("ignore")
Title: Haberman's Survival Data
Sources: (a) Donor: Tjen-Sien Lim (limt@stat.wisc.edu) (b) Date: March 4, 1999
Past Usage:
Relevant Information: The dataset contains cases from a study that was conducted between 1958 and 1970 at the University of Chicago's Billings Hospital on the survival of patients who had undergone surgery for breast cancer.
Number of Instances: 306
Number of Attributes: 4 (including the class attribute)
Attribute Information: (1) Age of patient at time of operation, (2) Patient's year of operation (year - 1900), (3) Number of positive axillary nodes detected, (4) Survival status (class 1 = patient survived 5 years or longer, class 2 = patient died within 5 years)
Missing Attribute Values: None
We have decided to use Python together with Jupyter notebook for our project.
from numpy import unique
from pandas import read_csv

# Fetch the Haberman survival dataset (306 breast-cancer surgery cases)
# and label its four anonymous columns.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/haberman.csv'
dataframe = read_csv(url, header=None)
dataframe.columns = ["age", "year", "nodes", "survival"]
dataframe
| age | year | nodes | survival | |
|---|---|---|---|---|
| 0 | 30 | 64 | 1 | 1 |
| 1 | 30 | 62 | 3 | 1 |
| 2 | 30 | 65 | 0 | 1 |
| 3 | 31 | 59 | 2 | 1 |
| 4 | 31 | 65 | 4 | 1 |
| ... | ... | ... | ... | ... |
| 301 | 75 | 62 | 1 | 1 |
| 302 | 76 | 67 | 0 | 1 |
| 303 | 77 | 65 | 3 | 1 |
| 304 | 78 | 65 | 1 | 2 |
| 305 | 83 | 58 | 2 | 2 |
306 rows × 4 columns
dataframe.describe()
| age | year | nodes | survival | |
|---|---|---|---|---|
| count | 306.000000 | 306.000000 | 306.000000 | 306.000000 |
| mean | 52.457516 | 62.852941 | 4.026144 | 1.264706 |
| std | 10.803452 | 3.249405 | 7.189654 | 0.441899 |
| min | 30.000000 | 58.000000 | 0.000000 | 1.000000 |
| 25% | 44.000000 | 60.000000 | 0.000000 | 1.000000 |
| 50% | 52.000000 | 63.000000 | 1.000000 | 1.000000 |
| 75% | 60.750000 | 65.750000 | 4.000000 | 2.000000 |
| max | 83.000000 | 69.000000 | 52.000000 | 2.000000 |
# Check NA values: largest per-column count of missing entries (0 == none).
dataframe.isna().sum().max()
0
# Summarize the dataset: dimensions, class labels, and per-class balance.
# Fix: dropped the dead store `breakdown = ''` (never used afterwards).
values = dataframe.values
X, y = values[:, :-1], values[:, -1]
n_rows, n_cols = X.shape
classes = unique(y)
n_classes = len(classes)
print('N Examples: %d' % n_rows)
print('N Inputs: %d' % n_cols)
print('N Classes: %d' % n_classes)
print('Classes: %s' % classes)
print('Class Breakdown:')
for c in classes:
    total = len(y[y == c])  # number of examples belonging to class c
    ratio = (total / float(len(y))) * 100
    print(' - Class %s: %d (%.5f%%)' % (str(c), total, ratio))
N Examples: 306 N Inputs: 3 N Classes: 2 Classes: [1 2] Class Breakdown: - Class 1: 225 (73.52941%) - Class 2: 81 (26.47059%)
# Per-class frequency table with columns ["class", "count"].
# Fix: pandas >= 2.0 already names the value_counts column "count", so the
# old rename(columns={"index": "class", "survival": "count"}) silently
# stopped matching; rename_axis/reset_index(name=...) is stable across
# pandas versions.
dataframe_counts = (
    dataframe["survival"]
    .value_counts()
    .rename_axis("class")
    .reset_index(name="count")
)
dataframe_counts
| class | count | |
|---|---|---|
| 0 | 1 | 225 |
| 1 | 2 | 81 |
import plotly.express as px

# Bar chart of the class distribution, each bar labelled with its raw count.
fig = px.bar(dataframe_counts, x="class", y="count",
             title="Count per Class", text='count')
fig.update_xaxes(type='category')
fig.show()
# Peek at the first five rows of the (possibly re-encoded) dataframe.
dataframe.head(5)
| age | year | nodes | survival | |
|---|---|---|---|---|
| 0 | 30 | 64 | 1 | 1 |
| 1 | 30 | 62 | 3 | 1 |
| 2 | 30 | 65 | 0 | 1 |
| 3 | 31 | 59 | 2 | 1 |
| 4 | 31 | 65 | 4 | 1 |
# Per-class boxplots for the two numeric features of interest.
for feature in ("age", "nodes"):
    fig = px.box(dataframe, x="survival", y=feature,
                 title="Boxplot for %s per Class" % feature.capitalize())
    fig.show()
# Cross-tabulate operation year with survival outcome into a long table.
summary = dataframe.groupby(['year', 'survival']).size().reset_index(name="count")
summary.head()
| year | survival | count | |
|---|---|---|---|
| 0 | 58 | 1 | 24 |
| 1 | 58 | 2 | 12 |
| 2 | 59 | 1 | 18 |
| 3 | 59 | 2 | 9 |
| 4 | 60 | 1 | 24 |
# Faceted, grouped bar chart of yearly counts split by class.
# Fix: summary["survival"] holds the integers 1 and 2, so category_orders
# must list ints -- the original string labels ["1", "2"] never matched.
fig = px.bar(summary, x="year", y="count", color="survival", barmode="group",
             facet_col="survival", text='count', title="Count per Year per Class",
             category_orders={"survival": [1, 2]})
fig.update_xaxes(type='category')
fig.update(layout_coloraxis_showscale=False)
fig.show()
# Same yearly counts, grouped in a single panel instead of facets.
# Fix: summary["survival"] holds the integers 1 and 2, so category_orders
# must list ints -- the original string labels ["1", "2"] never matched.
fig = px.bar(summary, x="year", y="count", color="survival", barmode="group",
             text='count', title="Count per Year per Class",
             category_orders={"survival": [1, 2]})
fig.update_xaxes(type='category')
fig.update(layout_coloraxis_showscale=False)
fig.show()
import plotly.figure_factory as ff
import numpy as np

# Histogram + KDE + rug for the operation-year feature.
fig = ff.create_distplot([dataframe["year"]], ['year'])
fig.update_layout(title="Distribution of Year")
fig.show()
import plotly.figure_factory as ff
import numpy as np

# Histogram + KDE + rug for the patient-age feature.
fig = ff.create_distplot([dataframe["age"]], ['age'])
fig.update_layout(title="Distribution of Age")
fig.show()
# Histogram + KDE + rug for the positive-axillary-nodes feature.
fig = ff.create_distplot([dataframe["nodes"]], ['nodes'])
fig.update_layout(title="Distribution of Nodes")
fig.show()
import plotly.express as px

# Pairwise scatter matrix across all four columns.
fig = px.scatter_matrix(dataframe, title="Scatter Matrix Haberman Survival")
fig.show()
import statsmodels.api as sm
import pandas as pd

# Build the regression design matrix: one-hot encoded operation year
# (reference level dropped) plus the two numeric features. The response
# is the survival dummy (indicator for class 2).
indep = pd.get_dummies(dataframe['year'], drop_first=True)
indep[["nodes", "age"]] = dataframe[["nodes", "age"]]
dep = pd.get_dummies(dataframe['survival'], drop_first=True)
import statsmodels.api as sm
# Fit a linear probability model: survival dummy ~ year dummies + nodes + age.
# NOTE(review): no intercept (sm.add_constant) is added, which is why the
# summary reports *uncentered* R-squared -- confirm this is intentional.
X = indep
y = dep
model = sm.OLS(y, X).fit()
predictions = model.predict(X)  # in-sample fitted values from the model
model.summary()
| Dep. Variable: | 2 | R-squared (uncentered): | 0.362 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared (uncentered): | 0.334 |
| Method: | Least Squares | F-statistic: | 12.79 |
| Date: | Thu, 04 Nov 2021 | Prob (F-statistic): | 3.30e-22 |
| Time: | 15:30:36 | Log-Likelihood: | -162.09 |
| No. Observations: | 306 | AIC: | 350.2 |
| Df Residuals: | 293 | BIC: | 398.6 |
| Df Model: | 13 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| 59 | 0.0352 | 0.099 | 0.357 | 0.721 | -0.159 | 0.230 |
| 60 | -0.1270 | 0.096 | -1.324 | 0.186 | -0.316 | 0.062 |
| 61 | -0.2112 | 0.103 | -2.048 | 0.041 | -0.414 | -0.008 |
| 62 | -0.0326 | 0.107 | -0.305 | 0.761 | -0.243 | 0.178 |
| 63 | -0.0493 | 0.096 | -0.512 | 0.609 | -0.239 | 0.140 |
| 64 | -0.0188 | 0.096 | -0.197 | 0.844 | -0.207 | 0.169 |
| 65 | 0.1314 | 0.100 | 1.309 | 0.192 | -0.066 | 0.329 |
| 66 | -0.0902 | 0.100 | -0.901 | 0.368 | -0.287 | 0.107 |
| 67 | -0.1599 | 0.104 | -1.532 | 0.127 | -0.365 | 0.046 |
| 68 | -0.0846 | 0.135 | -0.625 | 0.532 | -0.351 | 0.182 |
| 69 | 0.0355 | 0.138 | 0.257 | 0.798 | -0.237 | 0.308 |
| nodes | 0.0177 | 0.003 | 5.292 | 0.000 | 0.011 | 0.024 |
| age | 0.0046 | 0.001 | 3.969 | 0.000 | 0.002 | 0.007 |
| Omnibus: | 36.692 | Durbin-Watson: | 1.169 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 44.292 |
| Skew: | 0.904 | Prob(JB): | 2.41e-10 |
| Kurtosis: | 2.550 | Cond. No. | 490. |
from sklearn.model_selection import train_test_split

# Re-encode survival in place as a 0/1 indicator (1 == original class 2),
# then hold out 25% of the rows for testing with a fixed seed.
dataframe["survival"] = pd.get_dummies(dataframe["survival"], drop_first=True)
train, test = train_test_split(dataframe, train_size=0.75, random_state=1)
import statsmodels.api as sm
import pandas as pd

# Split the training rows into predictors and response.
Xtrain = train.drop(columns="survival")
ytrain = train["survival"]
# Fit a logistic regression by maximum likelihood.
# NOTE(review): no intercept column is added, which explains the odd
# pseudo R-squared / LL-Null values in the printed summary -- verify.
log_reg = sm.Logit(ytrain, Xtrain).fit()
Optimization terminated successfully.
Current function value: 214.180170
Iterations 5
# Display the fitted logistic-regression coefficient table.
print(log_reg.summary())
Logit Regression Results
==============================================================================
Dep. Variable: survival No. Observations: 229
Model: Logit Df Residuals: 226
Method: MLE Df Model: 2
Date: Thu, 04 Nov 2021 Pseudo R-squ.: inf
Time: 15:30:47 Log-Likelihood: -49047.
converged: True LL-Null: 0.0000
Covariance Type: nonrobust LLR p-value: 1.000
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
age 0.0032 0.015 0.220 0.826 -0.026 0.032
year -0.0242 0.013 -1.904 0.057 -0.049 0.001
nodes 0.0870 0.024 3.665 0.000 0.040 0.133
==============================================================================
# Evaluate the logit model on the held-out split.
# Fix: select the test-set predictor columns via test.columns (the original
# referenced train.columns here; the column sets are identical, but using
# the wrong frame was an accident waiting to happen).
Xtest = test.loc[:, test.columns != "survival"]
ytest = test["survival"]
# Predicted probabilities on the test dataset, rounded to the nearest class.
yhat = log_reg.predict(Xtest)
prediction = list(map(round, yhat))
# Comparing original and predicted values of y.
print('Actual values', list(ytest.values))
print('Predictions :', prediction)
Actual values [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0] Predictions : [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
from sklearn.metrics import (confusion_matrix, accuracy_score)

# Confusion matrix of observed vs predicted test labels.
cm = confusion_matrix(ytest, prediction)
print("Confusion Matrix : \n", cm)
# Overall fraction of correctly classified test rows.
print('Test accuracy = ', accuracy_score(ytest, prediction))
Confusion Matrix : [[54 4] [15 4]] Test accuracy = 0.7532467532467533
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, auc

# Refit with scikit-learn to obtain class probabilities for a ROC analysis.
clf = LogisticRegression(random_state=0).fit(Xtrain, ytrain)
y_score = clf.decision_function(Xtest)  # signed distance to decision boundary
pred_prob1 = clf.predict_proba(Xtest)

# ROC curve for the logistic-regression model (positive class = 1).
fpr1, tpr1, thresh1 = roc_curve(ytest, pred_prob1[:, 1], pos_label=1)

# Chance-level baseline: a constant score gives the tpr == fpr diagonal.
# Fix: build the constant list directly instead of a loop with an unused
# index variable; dead commented-out "model 2" code removed.
random_probs = [0] * len(ytest)
p_fpr, p_tpr, _ = roc_curve(ytest, random_probs, pos_label=1)

# Area under the ROC curve.
auc_score1 = roc_auc_score(ytest, pred_prob1[:, 1])
print("Auc score Logistic Regression:", auc_score1)
Auc score Logistic Regression: 0.8103448275862069
# matplotlib
import matplotlib.pyplot as plt

# Fix: the 'seaborn' style was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old alias was later removed; try the new name first and fall
# back to the old one on older installations.
try:
    plt.style.use('seaborn-v0_8')
except (OSError, ValueError):
    plt.style.use('seaborn')

# Plot the model's ROC curve against the chance-level diagonal.
plt.plot(fpr1, tpr1, linestyle='--', color='orange', label='Logistic Regression')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.show()
See above figures.